In [ ]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import when
from pyspark.sql.functions import regexp_replace
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import approx_count_distinct
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

Criando Sessao e Lendo CSV¶

In [ ]:
spark = SparkSession.builder \
    .appName("Session Atividade 2") \
    .getOrCreate()

customer = spark.read.csv("/user/customer.csv", inferSchema=True, header=True, sep=";")

Previa do CSV¶

In [ ]:
customer.show()
+-----------+--------+---------+-----+---------+---+--------------------+-------------+---+------------+------+---------------+-------------+---------------+----------------+--------------+--------------+----------+---------------+----------------------+-----------------------+----------------------+---------------+-----------+-----------+--------------+------------------+-------------+----------------+--------+---------+
|CUSTOMER_ID|    LAST|    FIRST|STATE|   REGION|SEX|          PROFESSION|BUY_INSURANCE|AGE|HAS_CHILDREN|SALARY|N_OF_DEPENDENTS|CAR_OWNERSHIP|HOUSE_OWNERSHIP|TIME_AS_CUSTOMER|MARITAL_STATUS|CREDIT_BALANCE|BANK_FUNDS|CHECKING_AMOUNT|MONEY_MONTLY_OVERDRAWN|T_AMOUNT_AUTOM_PAYMENTS|MONTHLY_CHECKS_WRITTEN|MORTGAGE_AMOUNT|N_TRANS_ATM|N_MORTGAGES|N_TRANS_TELLER|CREDIT_CARD_LIMITS|N_TRANS_KIOSK|N_TRANS_WEB_BANK|     LTV|  LTV_BIN|
+-----------+--------+---------+-----+---------+---+--------------------+-------------+---+------------+------+---------------+-------------+---------------+----------------+--------------+--------------+----------+---------------+----------------------+-----------------------+----------------------+---------------+-----------+-----------+--------------+------------------+-------------+----------------+--------+---------+
|     CU8589|    KALA|  IVERSON|   WI|  Midwest|  F|              PROF-9|           No| 49|           1| 68696|              1|            1|              1|               1|       WIDOWED|             0|     16100|             25|                 53,14|                   1749|                     4|           5500|          2|          1|             5|               800|            1|            3700|   25574|     HIGH|
|     CU9823|  NESTOR|   HORTON|   CA|     West|  M|               Nurse|           No| 24|           0| 73850|              0|            1|              0|               2|        SINGLE|             0|         0|             25|                 53,06|                    504|                     0|              0|          0|          0|             0|              1500|            1|               0| 21862,5|   MEDIUM|
|    CU12175|  DWIGHT|  ROLLINS|   MI|  Midwest|  M|Programmer/Developer|           No| 26|           1| 60249|              2|            1|              1|               2|       MARRIED|           452|       500|            134|                 53,42|                    625|                     4|           1036|          4|          1|             2|              1000|            4|            1036|19662,25|   MEDIUM|
|     CU7924|   GRADY|   STATON|   NY|NorthEast|  M|Programmer/Developer|           No| 32|           0| 60466|              1|            1|              0|               1|        SINGLE|             0|       650|            265|                 53,18|                    278|                    17|              0|          3|          0|             2|               700|            3|               0| 16816,5|   MEDIUM|
|    CU14284|DOMINICK|      LAW|   NY|NorthEast|  M|Construction Laborer|           No| 24|           0| 76570|              3|            1|              1|               3|        SINGLE|             0|         0|             25|                 53,06|                      0|                     1|            358|          0|          1|             0|              1500|            1|             358| 27042,5|     HIGH|
|    CU14620|  DANIAL|   CORLEY|   UT|Southwest|  M|        Truck Driver|           No| 35|           1| 62756|              1|            1|              1|               1|       MARRIED|             0|       501|           1382|                 53,32|                   1636|                     8|           1500|          3|          1|             2|               500|            1|            1500|   22689|     HIGH|
|    CU15186|  VERNON|    NIETO|   NY|NorthEast|  F|            IT Staff|           No| 36|           1| 62886|              1|            1|              1|               1|       MARRIED|             0|       600|             25|                 53,22|                    757|                     2|           1020|          3|          1|             2|              1000|            1|            1020| 22821,5|     HIGH|
|     CU5165|   KITTY|   COFFEY|   MI|  Midwest|  F|        Truck Driver|           No| 26|           1| 61012|              1|            1|              1|               1|       MARRIED|             0|      2400|           1314|                 53,27|                   2299|                     3|           1300|          3|          1|             1|              1100|            0|            1300|   21353|   MEDIUM|
|     CU5938|   TEDDY|  MURILLO|   CA|     West|  M|              Author|           No| 78|           1| 65134|              0|            1|              0|               2|        SINGLE|             0|         0|             25|                 53,08|                    514|                     4|              0|          1|          0|             0|               900|            0|               0| 20083,5|   MEDIUM|
|     CU3296| MARYLEE| HATHAWAY|   CA|     West|  F|       Not specified|           No| 49|           1| 60322|              3|            1|              1|               4|       WIDOWED|         30650|      9550|            266|                 53,73|                  33850|                    12|           6500|          4|          1|             4|               900|            2|            1700| 20980,5|   MEDIUM|
|     CU1069| THURMAN|  VAUGHAN|   MN|     West|  M|             PROF-15|           No| 80|           1| 63067|              0|            1|              0|               2|        SINGLE|             0|         0|             25|                 53,06|                    504|                     0|              0|          0|          0|             1|              1400|            0|               0|19766,75|   MEDIUM|
|    CU13543|   DARYL|  ADDISON|   WA|     West|  F|             PROF-39|           No| 52|           1| 73090|              5|            1|              2|               5|      DIVORCED|             0|      1750|             25|                 53,33|                    249|                    10|           2728|          3|          2|             3|              1500|            1|            2728| 30972,5|VERY HIGH|
|    CU11717|  LAVETA|    CASEY|   NY|NorthEast|  F|           Publisher|           No| 31|           0| 71782|              5|            1|              1|               5|      DIVORCED|             0|         0|             34|                 53,02|                      0|                     0|           2200|          2|          1|             1|              1500|            1|            2200| 23545,5|     HIGH|
|     CU5928| ROBERTO|CORNELIUS|   NY|NorthEast|  M|    Childcare Worker|           No| 44|           1| 61056|              1|            1|              1|               1|      DIVORCED|             0|         0|            461|                 53,15|                    965|                     0|           2200|          2|          1|             1|              1000|            0|            2200|   23164|     HIGH|
|    CU10012|  DORSEY|   PRIEST|   NM|Southwest|  M|             Cashier|           No| 52|           0| 59480|              3|            1|              1|               4|       MARRIED|             0|         0|             25|                 53,06|                      0|                     0|           1001|          0|          1|             0|               700|            1|            1001|   26070|     HIGH|
|      CU197| ENRIQUE|  BARRIOS|   MI|  Midwest|  M|               Nurse|           No| 77|           0| 73461|              0|            1|              0|               2|        SINGLE|             0|         0|             25|                 53,06|                    598|                     0|              0|          0|          0|             0|              2500|            0|               0|27065,25|     HIGH|
|      CU476|   CHONG|  CROWDER|   CA|     West|  M|Administrative As...|           No| 69|           1| 61943|              0|            1|              1|               2|       MARRIED|             0|         0|           1466|                 53,12|                   1971|                     0|            909|          2|          1|             1|               600|            0|             909|28385,75|     HIGH|
|     CU9110|    JOHN|    CLARK|   DC|NorthEast|  M|Administrative As...|           No| 46|           1| 64648|              0|            1|              1|               2|      DIVORCED|             0|         0|             25|                 53,06|                      0|                     0|           2500|          0|          1|             1|               800|            1|            2500|   26762|     HIGH|
|     CU7291|  LAUREN|    LAYNE|   MI|  Midwest|  M|               Nurse|           No| 42|           1| 58327|              0|            1|              0|               4|        SINGLE|             0|      5100|           1926|                 53,45|                   2570|                     4|              0|          4|          0|             2|               800|            1|            1500|15781,75|   MEDIUM|
|     CU7148| VINCENT|     COBB|   OK|  Midwest|  M|Programmer/Developer|           No| 28|           0| 60968|              2|            1|              1|               2|       MARRIED|             0|       750|            206|                 53.00|                    725|                     3|            700|          2|          1|             3|               900|            1|             700|   25042|     HIGH|
+-----------+--------+---------+-----+---------+---+--------------------+-------------+---+------------+------+---------------+-------------+---------------+----------------+--------------+--------------+----------+---------------+----------------------+-----------------------+----------------------+---------------+-----------+-----------+--------------+------------------+-------------+----------------+--------+---------+
only showing top 20 rows

Target = BUY_INSURANCE, aplicar Arvore de decisão¶

Verificando tipos¶

In [ ]:
customer.printSchema()
root
 |-- CUSTOMER_ID: string (nullable = true)
 |-- LAST: string (nullable = true)
 |-- FIRST: string (nullable = true)
 |-- STATE: string (nullable = true)
 |-- REGION: string (nullable = true)
 |-- SEX: string (nullable = true)
 |-- PROFESSION: string (nullable = true)
 |-- BUY_INSURANCE: string (nullable = true)
 |-- AGE: integer (nullable = true)
 |-- HAS_CHILDREN: integer (nullable = true)
 |-- SALARY: integer (nullable = true)
 |-- N_OF_DEPENDENTS: integer (nullable = true)
 |-- CAR_OWNERSHIP: integer (nullable = true)
 |-- HOUSE_OWNERSHIP: integer (nullable = true)
 |-- TIME_AS_CUSTOMER: integer (nullable = true)
 |-- MARITAL_STATUS: string (nullable = true)
 |-- CREDIT_BALANCE: integer (nullable = true)
 |-- BANK_FUNDS: integer (nullable = true)
 |-- CHECKING_AMOUNT: integer (nullable = true)
 |-- MONEY_MONTLY_OVERDRAWN: string (nullable = true)
 |-- T_AMOUNT_AUTOM_PAYMENTS: integer (nullable = true)
 |-- MONTHLY_CHECKS_WRITTEN: integer (nullable = true)
 |-- MORTGAGE_AMOUNT: integer (nullable = true)
 |-- N_TRANS_ATM: integer (nullable = true)
 |-- N_MORTGAGES: integer (nullable = true)
 |-- N_TRANS_TELLER: integer (nullable = true)
 |-- CREDIT_CARD_LIMITS: integer (nullable = true)
 |-- N_TRANS_KIOSK: integer (nullable = true)
 |-- N_TRANS_WEB_BANK: integer (nullable = true)
 |-- LTV: string (nullable = true)
 |-- LTV_BIN: string (nullable = true)

Foram encontrados dados numericos como String, fazendo a correcao:¶

In [ ]:
customer = customer.withColumn("MONEY_MONTLY_OVERDRAWN", regexp_replace(col("MONEY_MONTLY_OVERDRAWN"), ",", ".").cast("float"))
customer = customer.withColumn("LTV", regexp_replace(col("LTV"), ",", ".").cast("float"))
In [ ]:
customer.show()
+-----------+--------+---------+-----+---------+---+--------------------+-------------+---+------------+------+---------------+-------------+---------------+----------------+--------------+--------------+----------+---------------+----------------------+-----------------------+----------------------+---------------+-----------+-----------+--------------+------------------+-------------+----------------+--------+---------+
|CUSTOMER_ID|    LAST|    FIRST|STATE|   REGION|SEX|          PROFESSION|BUY_INSURANCE|AGE|HAS_CHILDREN|SALARY|N_OF_DEPENDENTS|CAR_OWNERSHIP|HOUSE_OWNERSHIP|TIME_AS_CUSTOMER|MARITAL_STATUS|CREDIT_BALANCE|BANK_FUNDS|CHECKING_AMOUNT|MONEY_MONTLY_OVERDRAWN|T_AMOUNT_AUTOM_PAYMENTS|MONTHLY_CHECKS_WRITTEN|MORTGAGE_AMOUNT|N_TRANS_ATM|N_MORTGAGES|N_TRANS_TELLER|CREDIT_CARD_LIMITS|N_TRANS_KIOSK|N_TRANS_WEB_BANK|     LTV|  LTV_BIN|
+-----------+--------+---------+-----+---------+---+--------------------+-------------+---+------------+------+---------------+-------------+---------------+----------------+--------------+--------------+----------+---------------+----------------------+-----------------------+----------------------+---------------+-----------+-----------+--------------+------------------+-------------+----------------+--------+---------+
|     CU8589|    KALA|  IVERSON|   WI|  Midwest|  F|              PROF-9|           No| 49|           1| 68696|              1|            1|              1|               1|       WIDOWED|             0|     16100|             25|                 53.14|                   1749|                     4|           5500|          2|          1|             5|               800|            1|            3700| 25574.0|     HIGH|
|     CU9823|  NESTOR|   HORTON|   CA|     West|  M|               Nurse|           No| 24|           0| 73850|              0|            1|              0|               2|        SINGLE|             0|         0|             25|                 53.06|                    504|                     0|              0|          0|          0|             0|              1500|            1|               0| 21862.5|   MEDIUM|
|    CU12175|  DWIGHT|  ROLLINS|   MI|  Midwest|  M|Programmer/Developer|           No| 26|           1| 60249|              2|            1|              1|               2|       MARRIED|           452|       500|            134|                 53.42|                    625|                     4|           1036|          4|          1|             2|              1000|            4|            1036|19662.25|   MEDIUM|
|     CU7924|   GRADY|   STATON|   NY|NorthEast|  M|Programmer/Developer|           No| 32|           0| 60466|              1|            1|              0|               1|        SINGLE|             0|       650|            265|                 53.18|                    278|                    17|              0|          3|          0|             2|               700|            3|               0| 16816.5|   MEDIUM|
|    CU14284|DOMINICK|      LAW|   NY|NorthEast|  M|Construction Laborer|           No| 24|           0| 76570|              3|            1|              1|               3|        SINGLE|             0|         0|             25|                 53.06|                      0|                     1|            358|          0|          1|             0|              1500|            1|             358| 27042.5|     HIGH|
|    CU14620|  DANIAL|   CORLEY|   UT|Southwest|  M|        Truck Driver|           No| 35|           1| 62756|              1|            1|              1|               1|       MARRIED|             0|       501|           1382|                 53.32|                   1636|                     8|           1500|          3|          1|             2|               500|            1|            1500| 22689.0|     HIGH|
|    CU15186|  VERNON|    NIETO|   NY|NorthEast|  F|            IT Staff|           No| 36|           1| 62886|              1|            1|              1|               1|       MARRIED|             0|       600|             25|                 53.22|                    757|                     2|           1020|          3|          1|             2|              1000|            1|            1020| 22821.5|     HIGH|
|     CU5165|   KITTY|   COFFEY|   MI|  Midwest|  F|        Truck Driver|           No| 26|           1| 61012|              1|            1|              1|               1|       MARRIED|             0|      2400|           1314|                 53.27|                   2299|                     3|           1300|          3|          1|             1|              1100|            0|            1300| 21353.0|   MEDIUM|
|     CU5938|   TEDDY|  MURILLO|   CA|     West|  M|              Author|           No| 78|           1| 65134|              0|            1|              0|               2|        SINGLE|             0|         0|             25|                 53.08|                    514|                     4|              0|          1|          0|             0|               900|            0|               0| 20083.5|   MEDIUM|
|     CU3296| MARYLEE| HATHAWAY|   CA|     West|  F|       Not specified|           No| 49|           1| 60322|              3|            1|              1|               4|       WIDOWED|         30650|      9550|            266|                 53.73|                  33850|                    12|           6500|          4|          1|             4|               900|            2|            1700| 20980.5|   MEDIUM|
|     CU1069| THURMAN|  VAUGHAN|   MN|     West|  M|             PROF-15|           No| 80|           1| 63067|              0|            1|              0|               2|        SINGLE|             0|         0|             25|                 53.06|                    504|                     0|              0|          0|          0|             1|              1400|            0|               0|19766.75|   MEDIUM|
|    CU13543|   DARYL|  ADDISON|   WA|     West|  F|             PROF-39|           No| 52|           1| 73090|              5|            1|              2|               5|      DIVORCED|             0|      1750|             25|                 53.33|                    249|                    10|           2728|          3|          2|             3|              1500|            1|            2728| 30972.5|VERY HIGH|
|    CU11717|  LAVETA|    CASEY|   NY|NorthEast|  F|           Publisher|           No| 31|           0| 71782|              5|            1|              1|               5|      DIVORCED|             0|         0|             34|                 53.02|                      0|                     0|           2200|          2|          1|             1|              1500|            1|            2200| 23545.5|     HIGH|
|     CU5928| ROBERTO|CORNELIUS|   NY|NorthEast|  M|    Childcare Worker|           No| 44|           1| 61056|              1|            1|              1|               1|      DIVORCED|             0|         0|            461|                 53.15|                    965|                     0|           2200|          2|          1|             1|              1000|            0|            2200| 23164.0|     HIGH|
|    CU10012|  DORSEY|   PRIEST|   NM|Southwest|  M|             Cashier|           No| 52|           0| 59480|              3|            1|              1|               4|       MARRIED|             0|         0|             25|                 53.06|                      0|                     0|           1001|          0|          1|             0|               700|            1|            1001| 26070.0|     HIGH|
|      CU197| ENRIQUE|  BARRIOS|   MI|  Midwest|  M|               Nurse|           No| 77|           0| 73461|              0|            1|              0|               2|        SINGLE|             0|         0|             25|                 53.06|                    598|                     0|              0|          0|          0|             0|              2500|            0|               0|27065.25|     HIGH|
|      CU476|   CHONG|  CROWDER|   CA|     West|  M|Administrative As...|           No| 69|           1| 61943|              0|            1|              1|               2|       MARRIED|             0|         0|           1466|                 53.12|                   1971|                     0|            909|          2|          1|             1|               600|            0|             909|28385.75|     HIGH|
|     CU9110|    JOHN|    CLARK|   DC|NorthEast|  M|Administrative As...|           No| 46|           1| 64648|              0|            1|              1|               2|      DIVORCED|             0|         0|             25|                 53.06|                      0|                     0|           2500|          0|          1|             1|               800|            1|            2500| 26762.0|     HIGH|
|     CU7291|  LAUREN|    LAYNE|   MI|  Midwest|  M|               Nurse|           No| 42|           1| 58327|              0|            1|              0|               4|        SINGLE|             0|      5100|           1926|                 53.45|                   2570|                     4|              0|          4|          0|             2|               800|            1|            1500|15781.75|   MEDIUM|
|     CU7148| VINCENT|     COBB|   OK|  Midwest|  M|Programmer/Developer|           No| 28|           0| 60968|              2|            1|              1|               2|       MARRIED|             0|       750|            206|                  53.0|                    725|                     3|            700|          2|          1|             3|               900|            1|             700| 25042.0|     HIGH|
+-----------+--------+---------+-----+---------+---+--------------------+-------------+---+------------+------+---------------+-------------+---------------+----------------+--------------+--------------+----------+---------------+----------------------+-----------------------+----------------------+---------------+-----------+-----------+--------------+------------------+-------------+----------------+--------+---------+
only showing top 20 rows

Fazendo um Describe dos dados¶

In [ ]:
customer.describe().show()
+-------+-----------+-----+-------+-----+-------+----+---------------+-------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+--------------+------------------+-----------------+------------------+----------------------+-----------------------+----------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+---------+
|summary|CUSTOMER_ID| LAST|  FIRST|STATE| REGION| SEX|     PROFESSION|BUY_INSURANCE|               AGE|      HAS_CHILDREN|           SALARY|  N_OF_DEPENDENTS|     CAR_OWNERSHIP|   HOUSE_OWNERSHIP|  TIME_AS_CUSTOMER|MARITAL_STATUS|    CREDIT_BALANCE|       BANK_FUNDS|   CHECKING_AMOUNT|MONEY_MONTLY_OVERDRAWN|T_AMOUNT_AUTOM_PAYMENTS|MONTHLY_CHECKS_WRITTEN|   MORTGAGE_AMOUNT|       N_TRANS_ATM|       N_MORTGAGES|    N_TRANS_TELLER|CREDIT_CARD_LIMITS|     N_TRANS_KIOSK|  N_TRANS_WEB_BANK|              LTV|  LTV_BIN|
+-------+-----------+-----+-------+-----+-------+----+---------------+-------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+--------------+------------------+-----------------+------------------+----------------------+-----------------------+----------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+---------+
|  count|       1015| 1015|   1015| 1015|   1015|1015|           1015|         1015|              1015|              1015|             1015|             1015|              1015|              1015|              1015|          1015|              1015|             1015|              1015|                  1015|                   1015|                  1015|              1015|              1015|              1015|              1015|              1015|              1015|              1015|             1015|     1015|
|   mean|       NULL| NULL|   NULL| NULL|   NULL|NULL|           NULL|         NULL|38.190147783251234|0.5113300492610837|65102.80295566502|1.993103448275862|0.9467980295566503|0.8049261083743843|2.4285714285714284|          NULL|2234.1014778325125|2639.839408866995| 1055.848275862069|    53.708157882314595|      4980.337931034483|     4.311330049261084|2066.1221674876847| 2.826600985221675|0.8049261083743843|1.7310344827586206| 1285.615763546798|1.8640394088669952|1449.6758620689654|22452.39039408867|     NULL|
| stddev|       NULL| NULL|   NULL| NULL|   NULL|NULL|           NULL|         NULL|14.918394106597274|0.5001180377766468|6848.442846277457|1.549623522433759|0.2245466473592848|0.5077072350739917|1.2333992047170599|          NULL|11727.873803601524|4996.100695402684|3124.4059848269044|    1.6850125888497685|       20443.6578124019|     4.817335938750649| 3184.931347333881|1.8914011204972785|0.5077072350739917|1.4993662085725994| 858.2066930270321|1.8260760353304921|2428.9208272335786|6579.356570077857|     NULL|
|    min|      CU100|AARON|ABRAHAM|   AK|Midwest|   F|     Accountant|           No|                 0|                 0|            37572|                0|                 0|                 0|                 1|      DIVORCED|                 0|                0|                25|                 32.16|                      0|                     0|                 0|                 0|                 0|                 0|               500|                 0|                 0|              0.0|     HIGH|
|    max|     CU9988| ZACK| ZUNIGA|   WI|   West|   M|Waiter/Waitress|          Yes|                84|                 1|           109943|                6|                 1|                 2|                 5|       WIDOWED|            170498|            36000|             23476|                 73.61|                 499362|                    18|             45000|                 8|                 2|                 9|              5000|                10|             45000|         43101.25|VERY HIGH|
+-------+-----------+-----+-------+-----+-------+----+---------------+-------------+------------------+------------------+-----------------+-----------------+------------------+------------------+------------------+--------------+------------------+-----------------+------------------+----------------------+-----------------------+----------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+---------+

In [ ]:
unique_value_counts = {}
for col in customer.columns:
    count = customer.select(approx_count_distinct(col)).collect()[0][0]
    unique_value_counts[col] = count
for col, count in unique_value_counts.items():
    print(f"Column '{col}' has {count} unique values.")
Column 'CUSTOMER_ID' has 1065 unique values.
Column 'LAST' has 852 unique values.
Column 'FIRST' has 857 unique values.
Column 'STATE' has 22 unique values.
Column 'REGION' has 5 unique values.
Column 'SEX' has 2 unique values.
Column 'PROFESSION' has 97 unique values.
Column 'BUY_INSURANCE' has 2 unique values.
Column 'AGE' has 65 unique values.
Column 'HAS_CHILDREN' has 2 unique values.
Column 'SALARY' has 1026 unique values.
Column 'N_OF_DEPENDENTS' has 7 unique values.
Column 'CAR_OWNERSHIP' has 2 unique values.
Column 'HOUSE_OWNERSHIP' has 3 unique values.
Column 'TIME_AS_CUSTOMER' has 5 unique values.
Column 'MARITAL_STATUS' has 5 unique values.
Column 'CREDIT_BALANCE' has 95 unique values.
Column 'BANK_FUNDS' has 272 unique values.
Column 'CHECKING_AMOUNT' has 347 unique values.
Column 'MONEY_MONTLY_OVERDRAWN' has 279 unique values.
Column 'T_AMOUNT_AUTOM_PAYMENTS' has 679 unique values.
Column 'MONTHLY_CHECKS_WRITTEN' has 19 unique values.
Column 'MORTGAGE_AMOUNT' has 244 unique values.
Column 'N_TRANS_ATM' has 9 unique values.
Column 'N_MORTGAGES' has 3 unique values.
Column 'N_TRANS_TELLER' has 9 unique values.
Column 'CREDIT_CARD_LIMITS' has 24 unique values.
Column 'N_TRANS_KIOSK' has 11 unique values.
Column 'N_TRANS_WEB_BANK' has 228 unique values.
Column 'LTV' has 1004 unique values.
Column 'LTV_BIN' has 4 unique values.

Removendo dados nao relevantes¶

In [ ]:
columns_to_drop = ["CUSTOMER_ID", "LAST", "FIRST", "PROFESSION", "STATE"]  # Add columns you want to drop here
customer = customer.drop(*columns_to_drop)

Verificando dados nulos¶

In [ ]:
from pyspark.sql.functions import col

null_counts = [(col_name, customer.where(col(col_name).isNull()).count()) for col_name in customer.columns]
for col_name, count in null_counts:
    print(f"{col_name}: {count}")
REGION: 0
SEX: 0
BUY_INSURANCE: 0
AGE: 0
HAS_CHILDREN: 0
SALARY: 0
N_OF_DEPENDENTS: 0
CAR_OWNERSHIP: 0
HOUSE_OWNERSHIP: 0
TIME_AS_CUSTOMER: 0
MARITAL_STATUS: 0
CREDIT_BALANCE: 0
BANK_FUNDS: 0
CHECKING_AMOUNT: 0
MONEY_MONTLY_OVERDRAWN: 0
T_AMOUNT_AUTOM_PAYMENTS: 0
MONTHLY_CHECKS_WRITTEN: 0
MORTGAGE_AMOUNT: 0
N_TRANS_ATM: 0
N_MORTGAGES: 0
N_TRANS_TELLER: 0
CREDIT_CARD_LIMITS: 0
N_TRANS_KIOSK: 0
N_TRANS_WEB_BANK: 0
LTV: 0
LTV_BIN: 0

Sem dados Nulos!¶

In [ ]:
var_categoricas = ['REGION', 'SEX', 'MARITAL_STATUS', 'LTV_BIN']
indexers = [
    StringIndexer(inputCol=column, outputCol=column+"_index").fit(customer)
    for column in var_categoricas
]
pipeline = Pipeline(stages=indexers)
customer = pipeline.fit(customer).transform(customer)
customer = customer.drop(*var_categoricas)

feature_cols = customer.columns.copy()
feature_cols.remove("BUY_INSURANCE")

vector_assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features"
)

customer = vector_assembler.transform(customer)

customer = customer.select(
    col("features").alias("features"),
    col("BUY_INSURANCE").alias("label"))


customer = customer.withColumn("label", when(customer["label"] == "No", 0).otherwise(1))
customer.show()
+--------------------+-----+
|            features|label|
+--------------------+-----+
|[49.0,1.0,68696.0...|    0|
|(25,[0,2,4,6,9,10...|    0|
|[26.0,1.0,60249.0...|    0|
|[32.0,0.0,60466.0...|    0|
|(25,[0,2,3,4,5,6,...|    0|
|[35.0,1.0,62756.0...|    0|
|[36.0,1.0,62886.0...|    0|
|[26.0,1.0,61012.0...|    0|
|(25,[0,1,2,4,6,9,...|    0|
|[49.0,1.0,60322.0...|    0|
|(25,[0,1,2,4,6,9,...|    0|
|[52.0,1.0,73090.0...|    0|
|[31.0,0.0,71782.0...|    0|
|[44.0,1.0,61056.0...|    0|
|[52.0,0.0,59480.0...|    0|
|(25,[0,2,4,6,9,10...|    0|
|[69.0,1.0,61943.0...|    0|
|[46.0,1.0,64648.0...|    0|
|[42.0,1.0,58327.0...|    0|
|[28.0,0.0,60968.0...|    0|
+--------------------+-----+
only showing top 20 rows

In [ ]:
(train_data, test_data) = customer.randomSplit([0.7, 0.3], seed=420)

dt = DecisionTreeClassifier(featuresCol="features")
evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC")

paramGrid = ParamGridBuilder() \
    .addGrid(dt.maxDepth, [5, 10, 15]) \
    .addGrid(dt.maxBins, [20, 40, 60, 80]) \
    .build()


crossval = CrossValidator(estimator=dt,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=5)

cvModel = crossval.fit(train_data)

bestModel = cvModel.bestModel
predictions = bestModel.transform(test_data)
roc_auc = evaluator.evaluate(predictions)
print(f"ROC-AUC Score: {roc_auc}")
ROC-AUC Score: 0.8796711509715994
In [ ]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator_accuracy = MulticlassClassificationEvaluator(metricName="accuracy")
evaluator_precision = MulticlassClassificationEvaluator(metricName="weightedPrecision")
evaluator_recall = MulticlassClassificationEvaluator(metricName="weightedRecall")
evaluator_f1 = MulticlassClassificationEvaluator(metricName="f1")

accuracy = evaluator_accuracy.evaluate(predictions)
precision = evaluator_precision.evaluate(predictions)
recall = evaluator_recall.evaluate(predictions)
f1_score = evaluator_f1.evaluate(predictions)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")
Accuracy: 0.8498402555910544
Precision: 0.8621285880890186
Recall: 0.8498402555910544
F1 Score: 0.8533440362635478
In [ ]:
regras = bestModel.toDebugString.replace("Predict: 1.0", "BUY_INSURANCE: Yes").replace("Predict: 0.0", "BUY_INSURANCE: No")
f = 0
used_features = []
for feature in feature_cols:
    regras = regras.replace(f"feature {f} ", f"{feature} ")
    if feature in regras:
        used_features.append(feature)
    f = f +1
In [ ]:
def parse_debug_string_lines(lines):
    
    block = []
    while lines:


        if lines[0].startswith('If'):
            bl = ' '.join(lines.pop(0).split()[1:]).replace('(', '').replace(')', '')
            block.append({'name': bl, 'children': parse_debug_string_lines(lines)})


            if lines[0].startswith('Else'):
                be = ' '.join(lines.pop(0).split()[1:]).replace('(', '').replace(')', '')
                block.append({'name': be, 'children': parse_debug_string_lines(lines)})
        elif not lines[0].startswith(('If', 'Else')):
            block2 = lines.pop(0)
            block.append({'name': block2})
        else:
            break
    
    return block


def debug_str_to_json(debug_string):
    data = []
    for line in debug_string.splitlines():
        if line.strip():
            line = line.strip()
            data.append(line)
        else:
            break
        if not line: break

    json = {'name': 'Root', 'children': parse_debug_string_lines(data[1:])}
    return json

import json
dict_tree_json = debug_str_to_json(regras)
# print(json.dumps(dict_tree_json,indent = 1 ))

https://jsoncrack.com/editor

In [ ]:
from IPython.display import SVG, display
def show_svg():
    display(SVG(filename='/home/glauco/staging/jsoncrack.com.svg'))
show_svg()
No description has been provided for this image
In [ ]:
#################################################
#Plotando a curva ROC
#################################################

from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType
import matplotlib.pyplot as plt

# Define a UDF to extract the probability of the positive class
def extract_probability(probability_vector):
    return float(probability_vector[1])

extract_prob_udf = udf(extract_probability, FloatType())

# Update the DataFrame to include a column for the positive class probability
predictions = predictions.withColumn("positive_probability", extract_prob_udf("probability"))

# Now compute the ROC curve
thresholds = [i / 100.0 for i in range(100)]  # Creating a list of threshold values from 0 to 1
roc_points = []

for threshold in thresholds:
    # Create a new column 'predicted_label' based on the probability threshold
    prediction_with_threshold = predictions.withColumn('predicted_label', (col("positive_probability") >= threshold).cast("integer"))

    # True Positive (TP): correctly predicted positive observations
    TP = prediction_with_threshold.filter('predicted_label = 1 AND label = 1').count()
    # False Positive (FP): incorrectly predicted positive observations
    FP = prediction_with_threshold.filter('predicted_label = 1 AND label = 0').count()
    # True Negative (TN): correctly predicted negative observations
    TN = prediction_with_threshold.filter('predicted_label = 0 AND label = 0').count()
    # False Negative (FN): incorrectly predicted negative observations
    FN = prediction_with_threshold.filter('predicted_label = 0 AND label = 1').count()
    
    # Calculate True Positive Rate (TPR) and False Positive Rate (FPR)
    TPR = TP / (TP + FN) if (TP + FN) else 0
    FPR = FP / (FP + TN) if (FP + TN) else 0
    
    roc_points.append((FPR, TPR))

# Extract FPR and TPR into separate lists for plotting
fpr, tpr = zip(*roc_points)

# Plot ROC curve using Matplotlib
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()
No description has been provided for this image